getwd()
setwd("/media/jcolner/My Passport1/City Ordinance Files/Methodology Paper/Replication/Replication Attempt 1")
getwd()


#### Load Libraries and Data ####
library(readxl)
library(dplyr)
library(stringr)
library(tidyverse)
library(writexl)
library(ggplot2)
library(lubridate)
library(stringdist)
library(grid)

#### Chula Vista ####

# Load the data
chulavista_ODV<- read_xlsx(path="Step 4/Data Extraction/Chula Vista/Two Meetings/Chula Vista_raw.xlsx")
chulavista_SLV<- read_xlsx(path="Step 5/Legistar Data/Chula Vista Legistar.xlsx")

# Clean and Standardize Object Detection Version

chulavista_ODV <- as.data.frame(chulavista_ODV)
# get the date
chulavista_ODV$digits<- str_extract(chulavista_ODV$Day,"\\d{8}")
chulavista_ODV$date <- as.Date(chulavista_ODV$digits, format = "%m%d%Y")

# Extract text and date
chulavista_ODV_clean <- chulavista_ODV[,c("text","date","CT")]
colnames(chulavista_ODV_clean) <- c("ODV_text","date","CT")
chulavista_ODV_clean$Version <- "Object Detection"

# Group values by week and summarize
ChulaVista_CT_count <- chulavista_ODV_clean %>%
  group_by(date,CT) %>%
  summarise(items = n())

# Clean and Standardize Scraped Legistar Version
chulavista_SLV <- chulavista_SLV[,c("MatterTitle","MatterAgendaDate")]
# get the date
chulavista_SLV$digits <- str_extract(chulavista_SLV$MatterAgendaDate,"\\d{4}-\\d{2}-\\d{2}")
chulavista_SLV$date <- as.Date(chulavista_SLV$digits, format = "%Y-%m-%d")
# Extract text and date
chulavista_SLV_clean <- chulavista_SLV[,c("MatterTitle","date")]
colnames(chulavista_SLV_clean) <- c("SLV_text","date")
chulavista_SLV_clean$Version <- "Scraped Legistar"

# Group values by date and summarize
chulavista_SLV_count <- chulavista_SLV_clean %>%
  group_by(date) %>%
  summarise(items_SLV = n())

# Duplicate legistar version times 5
ChulaVista_SLV_count2 <- rbind(chulavista_SLV_count,chulavista_SLV_count,chulavista_SLV_count,chulavista_SLV_count,chulavista_SLV_count)
ChulaVista_SLV_count2$CT <- rep(c(0.5,0.6,0.7,0.8,0.9),each=length(chulavista_SLV_count$date))

# Merge legistar version and ODV version
merged_ChulaVista <- merge(ChulaVista_CT_count,ChulaVista_SLV_count2,by=c("date","CT"))

# Save excel file with merged data
write_xlsx(merged_ChulaVista[merged_ChulaVista$CT==0.8,],"Step 6/Merged Data/Chula Vista Merged.xlsx")

# Convert merged data from separate columns for type to single column with a new type indicator column
ChulaVista1 <- merged_ChulaVista[merged_ChulaVista$CT==0.8,c("items","date")]
colnames(ChulaVista1)[colnames(ChulaVista1) == "items"] <- "Items"
ChulaVista1$type <- "ODV"
ChulaVista1$city <- "Chula Vista"
ChulaVista2 <- merged_ChulaVista[merged_ChulaVista$CT==0.8,c("items_SLV","date")]
ChulaVista2$type <- "SLV"
ChulaVista2$city <- "Chula Vista"
colnames(ChulaVista2)[colnames(ChulaVista2) == "items_SLV"] <- "Items"

ChulaVista3 <- rbind(ChulaVista1,ChulaVista2)

# Create Files for Text Comparison 
SLV_Dates <- unique(chulavista_SLV_clean$date)
ODV_Dates <- unique(chulavista_ODV_clean$date)
common_dates <- SLV_Dates[SLV_Dates %in% ODV_Dates] 
set.seed(1111)
test_ChulaVista <- sample(common_dates,20)

compare_text_SLV_ChulaVista <- chulavista_SLV_clean[chulavista_SLV_clean$date %in% test_ChulaVista,]
compare_text_ODV_ChulaVista <- chulavista_ODV_clean[chulavista_ODV_clean$date %in% test_ChulaVista,]

# Save files for Text Comparison
write_xlsx(compare_text_SLV_ChulaVista,"Step 6/Data for Hand Coding/Chula Vista Compare Text SLV.xlsx")
write_xlsx(compare_text_ODV_ChulaVista[compare_text_ODV_ChulaVista$CT==0.8,],"Step 6/Data for Hand Coding/Chula Vista Compare Text ODV.xlsx")

#### Temecula ####

# Load the data
Temecula<- read_xlsx(path="Step 4/Data Extraction/Temecula/Two Meetings/Temecula_raw.xlsx")
temecula_SLV<- read_xlsx(path="Step 5/Legistar Data/Temecula Legistar.xlsx")

Temecula <- as.data.frame(Temecula)

# get the date
Temecula$digits<- str_extract(Temecula$Day,"\\d{8}")
Temecula$date <- as.Date(Temecula$digits, format = "%m%d%Y")
Temecula$digits[is.na(Temecula$digits)] <- str_extract(Temecula$Day,"\\d{6}")
Temecula$date[is.na(Temecula$date)] <- as.Date(Temecula$digits[is.na(Temecula$date)], format = "%m%d%y")

# Extract text and date
Temecula_clean <- Temecula[,c("text","date","CT")]
colnames(Temecula_clean) <- c("ODV_text","date","CT")
Temecula_clean$Version <- "Object Detection"

# Group values by week and summarize
Temecula_count <- Temecula_clean %>%
  group_by(date,CT) %>%
  summarise(items = n())

temecula_SLV <- temecula_SLV[,c("MatterTitle","MatterAgendaDate")]
# get the date
temecula_SLV$digits <- str_extract(temecula_SLV$MatterAgendaDate,"\\d{4}-\\d{2}-\\d{2}")
temecula_SLV$date <- as.Date(temecula_SLV$digits, format = "%Y-%m-%d")
# Extract text and date
temecula_SLV_clean <- temecula_SLV[,c("MatterTitle","date")]
colnames(temecula_SLV_clean) <- c("SLV_text","date")
temecula_SLV_clean$Version <- "Scraped Legistar"

# Drop duplicates based on specific columns (ID in this case)
temecula_SLV_clean <- distinct(temecula_SLV_clean, SLV_text, .keep_all = TRUE)

# Group values by date and summarize
temecula_SLV_count <- temecula_SLV_clean %>%
  group_by(date) %>%
  summarise(items_SLV = n())

# Duplicate legistar version times 5
Temecula_SLV_count2 <- rbind(temecula_SLV_count,temecula_SLV_count,temecula_SLV_count,temecula_SLV_count,temecula_SLV_count)
Temecula_SLV_count2$CT <- rep(c(0.5,0.6,0.7,0.8,0.9),each=length(temecula_SLV_count$date))

# Merge legistar version and ODV version
merged_Temecula <- merge(Temecula_count,Temecula_SLV_count2,by=c("date","CT"))

# Save excel file with merged data
write_xlsx(merged_Temecula[merged_Temecula$CT==0.5,],"Step 6/Merged Data/Temecula Merged.xlsx")

# Convert merged data from  separate columns for type to single column with a new type indicator column
Temecula1 <- merged_Temecula[merged_Temecula$CT==0.5,c("items","date")]
colnames(Temecula1)[colnames(Temecula1) == "items"] <- "Items"
Temecula1$type <- "ODV"
Temecula1$city <- "Temecula"
Temecula2 <- merged_Temecula[merged_Temecula$CT==0.5,c("items_SLV","date")]
Temecula2$type <- "SLV"
Temecula2$city <- "Temecula"
colnames(Temecula2)[colnames(Temecula2) == "items_SLV"] <- "Items"

Temecula3 <- rbind(Temecula1,Temecula2)

# Create Files for Text Comparison 
SLV_Dates <- unique(temecula_SLV_clean$date)
ODV_Dates <- unique(Temecula_clean$date)
common_dates <- SLV_Dates[SLV_Dates %in% ODV_Dates] 
set.seed(1112)
test_temecula <- sample(common_dates,20)

compare_text_SLV_temecula <- temecula_SLV_clean[temecula_SLV_clean$date %in% test_temecula,]
compare_text_ODV_temecula <- Temecula_clean[Temecula_clean$date %in% test_temecula,]

# Save files for Text Comparison
write_xlsx(compare_text_SLV_temecula,"Step 6/Data for Hand Coding/Temecula Compare Text SLV.xlsx")
write_xlsx(compare_text_ODV_temecula[compare_text_ODV_temecula$CT==0.5,],"Step 6/Data for Hand Coding/Temecula Compare Text ODV.xlsx")

#### Santa Rosa ####

# Load the data
SantaRosa<- read_xlsx(path="Step 4/Data Extraction/Santa Rosa/Two Meetings/Santa Rosa_raw.xlsx")
santarosa_SLV<- read_xlsx(path="Step 5/Legistar Data/Santa Rosa Legistar.xlsx")

SantaRosa <- as.data.frame(SantaRosa)

# get the date
SantaRosa$digits<- str_extract(SantaRosa$Day,"\\d{8}")
SantaRosa$date <- as.Date(SantaRosa$digits, format = "%m%d%Y")

# Extract text and date
SantaRosa_clean <- SantaRosa[,c("text","date","CT")]
colnames(SantaRosa_clean) <- c("ODV_text","date","CT")
SantaRosa_clean$Version <- "Object Detection"

# Group values by week and summarize
SantaRosa_count <- SantaRosa_clean %>%
  group_by(date,CT) %>%
  summarise(items = n())

santarosa_SLV <- santarosa_SLV[,c("MatterTitle","MatterAgendaDate")]

# get the date
santarosa_SLV$digits <- str_extract(santarosa_SLV$MatterAgendaDate,"\\d{4}-\\d{2}-\\d{2}")
santarosa_SLV$date <- as.Date(santarosa_SLV$digits, format = "%Y-%m-%d")

# Extract text and date
santarosa_SLV_clean <- santarosa_SLV[,c("MatterTitle","date")]
colnames(santarosa_SLV_clean) <- c("SLV_text","date")
santarosa_SLV_clean$Version <- "Scraped Legistar"

# Group values by date and summarize
santarosa_SLV_count <- santarosa_SLV_clean %>%
  group_by(date) %>%
  summarise(items_SLV = n())

# Duplicate legistar version times 5
santarosa_SLV_count2 <- rbind(santarosa_SLV_count,santarosa_SLV_count,santarosa_SLV_count,santarosa_SLV_count,santarosa_SLV_count)
santarosa_SLV_count2$CT <- rep(c(0.5,0.6,0.7,0.8,0.9),each=length(santarosa_SLV_count$date))

# Merge legistar version and ODV version
merged_SantaRosa <- merge(SantaRosa_count,santarosa_SLV_count2,by=c("date","CT"))

# Save excel file with merged data
write_xlsx(merged_SantaRosa[merged_SantaRosa$CT==0.8,],"Step 6/Merged Data/Santa Rosa Merged.xlsx")

# Convert merged data from separate columns for type to single column with a new type indicator column
SantaRosa1 <- merged_SantaRosa[merged_SantaRosa$CT==0.8,c("items","date")]
colnames(SantaRosa1)[colnames(SantaRosa1) == "items"] <- "Items"
SantaRosa1$type <- "ODV"
SantaRosa1$city <- "Santa Rosa"
SantaRosa2 <- merged_SantaRosa[merged_SantaRosa$CT==0.8,c("items_SLV","date")]
SantaRosa2$type <- "SLV"
SantaRosa2$city <- "Santa Rosa"
colnames(SantaRosa2)[colnames(SantaRosa2) == "items_SLV"] <- "Items"

SantaRosa3 <- rbind(SantaRosa1,SantaRosa2)

# Create Files for Text Comparison
SLV_Dates <- unique(santarosa_SLV_clean$date)
ODV_Dates <- unique(SantaRosa_clean$date)
common_dates <- SLV_Dates[SLV_Dates %in% ODV_Dates] 
set.seed(1112)
test_santarosa <- sample(common_dates,20)

compare_text_SLV_santarosa <- santarosa_SLV_clean[santarosa_SLV_clean$date %in% test_santarosa,]
compare_text_ODV_santarosa <- SantaRosa_clean[SantaRosa_clean$date %in% test_santarosa,]

# Save files for Text Comparison
write_xlsx(compare_text_SLV_santarosa,"Step 6/Data for Hand Coding/Santa Rosa Compare Text SLV.xlsx")
write_xlsx(compare_text_ODV_santarosa[compare_text_ODV_santarosa$CT==0.8,],"Step 6/Data for Hand Coding/Santa Rosa Compare Text ODV.xlsx")

#### Visalia ####

# Load the Object Detection Version
visalia<- read_xlsx(path="Step 4/Data Extraction/Visalia/Two Meetings/Visalia_raw.xlsx")
visalia_SLV<- read_xlsx(path="Step 5/Legistar Data/Visalia Legistar.xlsx")

Visalia <- as.data.frame(visalia)

# get the date
Visalia$digits<- str_extract(Visalia$Day,"\\d{8}")
Visalia$date <- as.Date(Visalia$digits, format = "%m%d%Y")

# Extract text and date
Visalia_clean <- Visalia[,c("text","date","CT")]
colnames(Visalia_clean) <- c("ODV_text","date","CT")
Visalia_clean$Version <- "Object Detection"

# Group values by week and summarize
Visalia_count <- Visalia_clean %>%
  group_by(date,CT) %>%
  summarise(items = n())

visalia_SLV <- visalia_SLV[,c("MatterTitle","MatterAgendaDate")]

# get the date
visalia_SLV$digits <- str_extract(visalia_SLV$MatterAgendaDate,"\\d{4}-\\d{2}-\\d{2}")
visalia_SLV$date <- as.Date(visalia_SLV$digits, format = "%Y-%m-%d")

# Extract text and date
visalia_SLV_clean <- visalia_SLV[,c("MatterTitle","date")]
colnames(visalia_SLV_clean) <- c("SLV_text","date")
visalia_SLV_clean$Version <- "Scraped Legistar"

# Group values by date and summarize
visalia_SLV_count <- visalia_SLV_clean %>%
  group_by(date) %>%
  summarise(items_SLV = n())

# Duplicate legistar version times 5
Visalia_SLV_count2 <- rbind(visalia_SLV_count,visalia_SLV_count,visalia_SLV_count,visalia_SLV_count,visalia_SLV_count)
Visalia_SLV_count2$CT <- rep(c(0.5,0.6,0.7,0.8,0.9),each=length(visalia_SLV_count$date))

# Merge legistar version and ODV version
merged_Visalia <- merge(Visalia_count,Visalia_SLV_count2,by=c("date","CT"))

# Save excel file with merged data
write_xlsx(merged_Visalia[merged_Visalia$CT==0.8,],"Step 6/Merged Data/Visalia Merged.xlsx")

# Convert merged data from separate columns for type to single column with a new type indicator column
Visalia1 <- merged_Visalia[merged_Visalia$CT==0.8,c("items","date")]
colnames(Visalia1)[colnames(Visalia1) == "items"] <- "Items"
Visalia1$type <- "ODV"
Visalia1$city <- "Visalia"
Visalia2 <- merged_Visalia[merged_Visalia$CT==0.8,c("items_SLV","date")]
Visalia2$type <- "SLV"
Visalia2$city <- "Visalia"
colnames(Visalia2)[colnames(Visalia2) == "items_SLV"] <- "Items"

Visalia3 <- rbind(Visalia1,Visalia2)

# Create Files for Text Comparison
SLV_Dates <- unique(visalia_SLV_clean$date)
ODV_Dates <- unique(Visalia_clean$date)
common_dates <- SLV_Dates[SLV_Dates %in% ODV_Dates] 
set.seed(1112)
test_visalia <- sample(common_dates,20)

compare_text_SLV_visalia <- visalia_SLV_clean[visalia_SLV_clean$date %in% test_visalia,]
compare_text_ODV_visalia <- Visalia_clean[Visalia_clean$date %in% test_visalia,]

# Save files for Text Comparison
write_xlsx(compare_text_SLV_visalia,"Step 6/Data for Hand Coding/Visalia Compare Text SLV.xlsx")
write_xlsx(compare_text_ODV_visalia[compare_text_ODV_visalia$CT==0.8,],"Step 6/Data for Hand Coding/Visalia Compare Text ODV.xlsx")

#### South San Francisco ####

# Load the Object Detection Version
SouthSanFrancisco_5<- read_xlsx(path="Step 4/Data Extraction/South San Francisco/Two Meetings/South San Francisco_raw_.5.xlsx")
SouthSanFrancisco_5 <- SouthSanFrancisco_5[SouthSanFrancisco_5$CT==0.5,]
SouthSanFrancisco_6<- read_xlsx(path="Step 4/Data Extraction/South San Francisco/Two Meetings/South San Francisco_raw_0.6.xlsx")
SouthSanFrancisco_6 <- SouthSanFrancisco_6[SouthSanFrancisco_6$CT==0.6,]
SouthSanFrancisco_7<- read_xlsx(path="Step 4/Data Extraction/South San Francisco/Two Meetings/South San Francisco_raw_0.7.xlsx")
SouthSanFrancisco_7 <- SouthSanFrancisco_7[SouthSanFrancisco_7$CT==0.7,]
SouthSanFrancisco_8<- read_xlsx(path="Step 4/Data Extraction/South San Francisco/Two Meetings/South San Francisco_raw_0.8.xlsx")
SouthSanFrancisco_8 <- SouthSanFrancisco_8[SouthSanFrancisco_8$CT==0.8,]
SouthSanFrancisco_9<- read_xlsx(path="Step 4/Data Extraction/South San Francisco/Two Meetings/South San Francisco_raw_0.9.xlsx")
SouthSanFrancisco_9 <- SouthSanFrancisco_9[SouthSanFrancisco_9$CT==0.9,]
SouthSanFrancisco <- rbind(SouthSanFrancisco_5,SouthSanFrancisco_6,SouthSanFrancisco_7,SouthSanFrancisco_8,SouthSanFrancisco_9)

# Load Legistar version
southsanfrancisco_SLV<- read_xlsx(path="Step 5/Legistar Data/South San Francisco Legistar.xlsx")

southsanfrancisco_SLV <- southsanfrancisco_SLV[,c("MatterTitle","MatterAgendaDate")]

# get the date
southsanfrancisco_SLV$digits <- str_extract(southsanfrancisco_SLV$MatterAgendaDate,"\\d{4}-\\d{2}-\\d{2}")
southsanfrancisco_SLV$date <- as.Date(southsanfrancisco_SLV$digits, format = "%Y-%m-%d")

# Extract text and date
southsanfrancisco_SLV_clean <- southsanfrancisco_SLV[,c("MatterTitle","date")]
colnames(southsanfrancisco_SLV_clean) <- c("SLV_text","date")
southsanfrancisco_SLV_clean$Version <- "Scraped Legistar"

# Group values by date and summarize
southsanfrancisco_SLV_count <- southsanfrancisco_SLV_clean %>%
  group_by(date) %>%
  summarise(items_SLV = n())

SouthSanFrancisco <- as.data.frame(SouthSanFrancisco)

# get the date ODV
SouthSanFrancisco$digits<- str_extract(SouthSanFrancisco$Day,"\\d{8}")
SouthSanFrancisco$date <- as.Date(SouthSanFrancisco$digits, format = "%m%d%Y")

# Extract text and date ODV
SouthSanFrancisco_clean <- SouthSanFrancisco[,c("text","date","CT")]
colnames(SouthSanFrancisco_clean) <- c("ODV_text","date","CT")
SouthSanFrancisco_clean$Version <- "Object Detection"

# Group values by week and summarize
SouthSanFrancisco_count <- SouthSanFrancisco_clean %>%
  group_by(date,CT) %>%
  summarise(items = n())

# Duplicate legistar version times 5
SouthSanFrancisco_SLV_count2 <- rbind(southsanfrancisco_SLV_count,southsanfrancisco_SLV_count,southsanfrancisco_SLV_count,southsanfrancisco_SLV_count,southsanfrancisco_SLV_count)
SouthSanFrancisco_SLV_count2$CT <- rep(c(0.5,0.6,0.7,0.8,0.9),each=length(southsanfrancisco_SLV_count$date))

# Merge legistar version and ODV version
merged_SouthSanFrancisco <- merge(SouthSanFrancisco_count,SouthSanFrancisco_SLV_count2,by=c("date","CT"))

# Save excel file with merged data
write_xlsx(merged_SouthSanFrancisco[merged_SouthSanFrancisco$CT==0.8,],"Step 6/Merged Data/South San Francisco Merged.xlsx")

# Convert merged data from separate columns for type to single column with a new type indicator column
SouthSanFrancisco1 <- merged_SouthSanFrancisco[merged_SouthSanFrancisco$CT==0.8,c("items","date")]
colnames(SouthSanFrancisco1)[colnames(SouthSanFrancisco1) == "items"] <- "Items"
SouthSanFrancisco1$type <- "ODV"
SouthSanFrancisco1$city <- "South San Francisco"
SouthSanFrancisco2 <- merged_SouthSanFrancisco[merged_SouthSanFrancisco$CT==0.8,c("items_SLV","date")]
SouthSanFrancisco2$type <- "SLV"
SouthSanFrancisco2$city <- "South San Francisco"
colnames(SouthSanFrancisco2)[colnames(SouthSanFrancisco2) == "items_SLV"] <- "Items"

SouthSanFrancisco3 <- rbind(SouthSanFrancisco1,SouthSanFrancisco2)

# Create Files for Text Comparison 
SLV_Dates <- unique(southsanfrancisco_SLV_clean$date)
ODV_Dates <- unique(SouthSanFrancisco_clean$date)
common_dates <- SLV_Dates[SLV_Dates %in% ODV_Dates] 
set.seed(1113)
test_southsanfrancisco <- sample(common_dates,20)

compare_text_SLV_southsanfrancisco <- southsanfrancisco_SLV_clean[southsanfrancisco_SLV_clean$date %in% test_southsanfrancisco,]
compare_text_ODV_southsanfrancisco <- SouthSanFrancisco_clean[SouthSanFrancisco_clean$date %in% test_southsanfrancisco,]

# Save files for Text Comparison
write_xlsx(compare_text_SLV_southsanfrancisco,"Step 6/Data for Hand Coding/South San Francisco Compare Text SLV.xlsx")
write_xlsx(compare_text_ODV_southsanfrancisco[compare_text_ODV_southsanfrancisco$CT==0.8,],"Step 6/Data for Hand Coding/South San Francisco Compare Text ODV.xlsx")

#### Appendix Figure 7 ####

# Merge across cities
merge_final <- rbind(SantaRosa3,Temecula3,ChulaVista3,SouthSanFrancisco3,Visalia3)
merge_final <- merge_final[!is.na(merge_final$date),]
merge_final$date <- as.Date(merge_final$date)

# Test correlation score
santarosa_cor <- cor(merge_final$Items[merge_final$city=="Santa Rosa" & merge_final$type=="ODV"],
                     merge_final$Items[merge_final$city=="Santa Rosa" & merge_final$type=="SLV"])

# Create dataframe with the correlation between SLV and ODV for each city
dat_text <- data.frame(values = c(round(cor(merge_final$Items[merge_final$city=="Santa Rosa" & merge_final$type=="ODV"],
                                            merge_final$Items[merge_final$city=="Santa Rosa" & merge_final$type=="SLV"]),2),
                                  round(cor(merge_final$Items[merge_final$city=="Temecula" & merge_final$type=="ODV"],
                                            merge_final$Items[merge_final$city=="Temecula" & merge_final$type=="SLV"]),2),
                                  round(cor(merge_final$Items[merge_final$city=="Chula Vista" & merge_final$type=="ODV"],
                                            merge_final$Items[merge_final$city=="Chula Vista" & merge_final$type=="SLV"]),2),
                                  round(cor(merge_final$Items[merge_final$city=="South San Francisco" & merge_final$type=="ODV"],
                                            merge_final$Items[merge_final$city=="South San Francisco" & merge_final$type=="SLV"]),2),
                                  round(cor(merge_final$Items[merge_final$city=="Visalia" & merge_final$type=="ODV"],
                                            merge_final$Items[merge_final$city=="Visalia" & merge_final$type=="SLV"]),2)),
                       city=c("Santa Rosa","Temecula","Chula Vista","South San Francisco","Visalia"),
                       Items = c(30,35,30,10,30),
                       date=as.Date(c("2015-01-01","2020-01-01","2015-11-01","2017-11-01","2013-11-01")),
                       type=rep("ODV",5))

# Create text labels out of correlation from dat_text
dat_text$values2 <- paste("Correlation =",dat_text$values)

# black and white version of figure
ggplot(merge_final,aes(x = date, y = Items, group=type)) +
                          geom_smooth(aes(linetype=type),color="black") +
                          labs(title = "Number of Agenda Items Over Time",
                               x = "",
                               y = "Agenda Items") +
                          theme_bw()+facet_wrap(.~city,scales = "free_x",ncol=2)+
                          theme(axis.line = element_line(colour = "black"),
                                panel.grid.major = element_blank(),
                                panel.grid.minor = element_blank(),
                                panel.border = element_blank(),
                                panel.background = element_blank(),
                                axis.ticks.y = element_blank(),
                                axis.text=element_text(size=12,color="black"),
                                axis.title=element_text(size=18),
                                plot.title = element_text(size=18),
                                legend.text=element_text(size=14),
                                legend.position = "bottom",
                                legend.direction='vertical',
                                strip.text = element_text(size = 14),
                                strip.background = element_rect(colour="white", fill="white"))+
                          guides(linetype = guide_legend(title = ""))+
                          scale_linetype_manual(labels = c("Object Detection Method", "Legistar Scraping Method"),values=c("solid","dotted")) +
                          scale_x_date()+
                          geom_text(data=dat_text,aes(x = date, y = Items, label = values2),color="black",size=4.5)

#Save figure
ggsave("Final Figures/Appendix Figure 7.png",width=10,height=6)

#### Appendix A: Average Precision ####
#Extract the evaluation of the model performance for use in appendix A
Temecula_eval<- read.csv("Step 3/Model Output/Temecula/Two Meetings/From Manuscript/eval.csv")
Temecula_eval[1,2]
SantaRosa_eval<- read.csv("Step 3/Model Output/Santa Rosa/Two Meetings/From Manuscript/eval.csv")
SantaRosa_eval[1,2]
Visalia_eval<- read.csv("Step 3/Model Output/Visalia/Two Meetings/From Manuscript/eval.csv")
Visalia_eval[1,2]
ChulaVista_eval<- read.csv("Step 3/Model Output/Chula Vista/Two Meetings/From Manuscript/eval.csv")
ChulaVista_eval[1,2]
SSF_eval<- read.csv("Step 3/Model Output/South San Francisco/Two Meetings/From Manuscript/eval.csv")
SSF_eval[1,2]

# compare average precision to correlation score for use in appendix A
dat_text$OD_Precision <- c(SantaRosa_eval[1,2],Temecula_eval[1,2],ChulaVista_eval[1,2],SSF_eval[1,2],Visalia_eval[1,2])
cor(dat_text[,c("values","OD_Precision")])
cor(dat_text[-5,c("values","OD_Precision")])

#### Appendix C: Figure 3 ####

# Calculate correlation between SLV and ODV data across Confidence Thresholds
SantaRosa_cor <- merged_SantaRosa %>%
  group_by(CT) %>%
  summarize(correlation = cor(items, items_SLV))

SantaRosa_cor$city <- "Santa Rosa"

# Calculate correlation between SLV and ODV data across Confidence Thresholds
SouthSanFrancisco_cor <- merged_SouthSanFrancisco %>%
  group_by(CT) %>%
  summarize(correlation = cor(items, items_SLV))

SouthSanFrancisco_cor$city <- "South San Francisco"

# Calculate correlation between SLV and ODV data across Confidence Thresholds
Visalia_cor <- merged_Visalia %>%
  group_by(CT) %>%
  summarize(correlation = cor(items, items_SLV))

Visalia_cor$city <- "Visalia"

# Calculate correlation between SLV and ODV data across Confidence Thresholds
Temecula_cor <- merged_Temecula %>%
  group_by(CT) %>%
  summarize(correlation = cor(items, items_SLV))

Temecula_cor$city <- "Temecula"

# Calculate correlation between SLV and ODV data across Confidence Thresholds
ChulaVista_cor <- merged_ChulaVista %>%
  group_by(CT) %>%
  summarize(correlation = cor(items, items_SLV))

ChulaVista_cor$city <- "Chula Vista"

# Collect those correlation scores across cities
cor_mat <- rbind(SantaRosa_cor,SouthSanFrancisco_cor,Visalia_cor,Temecula_cor,ChulaVista_cor)

# Generate appendix figure 3 
ggplot(cor_mat,aes(x = CT, y = correlation, color=city,linetype=city)) +
  geom_point()+ geom_line() +
  labs(title = "Accuracy over Confidence Thresholds",
       x = "Confidence Threshold Used",
       y = "Correlation with Legistar") +
  theme_bw()+
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text=element_text(size=12,color="black"),
        axis.title=element_text(size=18),
        plot.title = element_text(size=18),
        legend.text=element_text(size=14),
        legend.position = "bottom",
        legend.direction='vertical',
        strip.text = element_text(size = 14),
        strip.background = element_rect(colour="white", fill="white"),
        legend.key.height= unit(.2, 'cm'),
        legend.key.width= unit(2, 'cm'))+
  guides(color = guide_legend(ncol=2),linetype=guide_legend(ncol=2))+
  labs(color  = "", linetype = "")+
  ylim(0,1)+
  scale_color_manual(values = c("black","black","black","grey","grey"))+
  scale_linetype_manual(values = c("solid","dotted","dashed","solid","dashed"))

# save figure
ggsave("Final Figures/Appendix Figure 3.png",width=10,height=6)

#### Appendix D: Figure 4 ####

# Create empty dataframe to populate with average precision across 25 models
cities <- c("Chula Vista","South San Francisco","Temecula", "Visalia", "Santa Rosa")
eval2 <- data.frame(cities = rep(cities,5), ap = rep(NA,25),
                    type=c(rep(c("One Meeting","Two Meetings","Three Meetings","Four Meetings","Five Meetings"),
                               each=5)),number=rep(NA,25))

eval2$type_num <- rep(c(1,2,3,4,5),each=5)

counts <- c("One Meeting","Two Meetings","Three Meetings","Four Meetings","Five Meetings")

# extract average precision from each model evaluation
for (city in cities){
  for (count in counts){
    file_path <- paste("Step 3/Model Output/",city,"/",count,"/From Manuscript/eval.csv",sep="") 
    # Read the CSV file
    csv_data <- read.csv(file = file_path)
    value <- csv_data[1,2]
    eval2$ap[eval2$cities==city & eval2$type==count] <- value
  }
}

# manually copy number of training and testing images for each model
eval2$number <- c(60,41,69,20,87,100,87,151,42,167,143,133,235,66,264,182,184,339,86,342,223,226,436,112,432)

# The number of Meetings collected for each year
order_levels <- c("One Meeting","Two Meetings","Three Meetings","Four Meetings","Five Meetings")

eval2$type <- factor(eval2$type,levels = order_levels,ordered=TRUE)

# Create appendix figure 4
ggplot(eval2,aes(x = number, y = ap, color=cities,linetype=cities)) +
  geom_point(size=2)+ geom_line(linewidth=1.5) +
  labs(title = "Average Precision by Training Sample",
       x = "# of Images in Training Sample",
       y = "Average Precision") +
  theme_bw()+
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text=element_text(size=12,color="black"),
        axis.title=element_text(size=18),
        plot.title = element_text(size=18),
        legend.text=element_text(size=14),
        legend.position = "bottom",
        legend.direction='vertical',
        strip.text = element_text(size = 14),
        strip.background = element_rect(colour="white", fill="white"),
        legend.key.height= unit(.2, 'cm'),
        legend.key.width= unit(2, 'cm'))+
  guides(color = guide_legend(ncol=2),linetype=guide_legend(ncol=2))+
  scale_color_manual(labels = c("Chula Vista", "South San Francisco","Temecula", "Visalia", "Santa Rosa"),
                     values = c("black","black","black","grey","grey"))+
  scale_linetype_manual(labels = c("Chula Vista", "South San Francisco","Temecula", "Visalia", "Santa Rosa"),
                        values = c("solid","dotted","dashed","solid","dashed"))+
  labs(color  = "", linetype = "")+
  ylim(50,100)

# Save figure
ggsave("Final Figures/Appendix Figure 4.png",width=10,height=6)




